(###-R script for the evaluation of the estimation method for national level data from aggregated surveys

# Required libraries ------------------------------------------------------
library('memisc') #Import SPSS files
library(plyr) #Data manipulation
library(sampling) #Draw samples
library(car) #Recoding N.B.Needs to be loaded after 'memisc'. Otherwise the 'recode' function gets mixed up
library("SmarterPoland") #access to EUROSTAT data
library(arm) #multi-level data analysis
library(Metrics) #for calculating errors
library(pscl) #for calculating pseudo r-squared measures for glm
library(sorvi) #string strips
library(Kendall) #Kendall's tau
library(ggplot2) #for plots
library(scales) #for formatting percent axes
#function for sums ignoring NAs
sum.na<-function (x) {sum(x, na.rm=T)}

# Part I. Getting the EUROBAROMETER public opinion data ------------------
#Load the data
data <- as.data.set(spss.system.file("E:/Project MLM and PS/ZA4229_v1-1-0.sav"))
#Get the geographical identifier
data$geoID<-as.factor(substring(as.character(data$v7),1,2))
#Subset the data for the needed countries
data1<-subset(data, geoID=='AT' | geoID=='BG'| geoID=='CY'
              | geoID=='CZ'| geoID=='DE'| geoID=='DK'| geoID=='EE'| geoID=='GR'
              | geoID=='ES'| geoID=='FI'| geoID=='FR'| geoID=='HU'
              | geoID=='IE'| geoID=='IT'| geoID=='LT'| geoID=='LU'
              | geoID=='LV'| geoID=='NL'| geoID=='PL'| geoID=='PT'
              | geoID=='RO'| geoID=='SI'| geoID=='SK'| geoID=='GB')

#outcome variable: immigration importance
data1$trust<-ifelse(data1$v224=='Mentioned', 1,0)


#Individual-level predictors: sex
data1$sex<-recode(as.factor(data1$v670), '"Female"="F";"Male"="M"')
#Individual-level predictors: age, recode to match the census data (see below)
data1$age<-recode(as.factor(data1$v673), ('"15 - 24 years"="Y15-24"; "25 - 34 years"="Y25-34";c("35 - 44 years", "45 - 54 years")="Y35-54";c("55 - 64 years","65 years and older")="Y55-74";else=NA'))
#Individual-level predictors: occupation, recode to match the census data (see below)
data1$work<-recode(as.factor(data1$v674), ('"Unemployed, temporarily not working"="UNE"; 
                                           c("Responsible for ordinary shopping, etc.","Student","Retired, unable to work")="INAC";
                                           else="EMP"'))
#Individual-level predictors: education, recode to match the census data (see below)
data1$edu<-recode(as.factor(data1$v669), (' "No full-time education"="NED";
                                            NA="UNK";
                                            "Up to 14 years"="ED1";
                                            c("15 years", "16 years","17 years")="ED2";
                                            c("18 years", "19 years")="ED3"; 
                                            c("20 years", "21 years")="ED4";
                                            else="ED5_6"                                            
                                                  '))
#State-level predictor: old and new member states
data1$reg<-recode(as.factor(data1$geoID), 'c("AT","BE","DE","DK","GR","ES","FI","FR","IE","IT","NL","PT","SE","GB","LU")=0; else=1')

#Subset the data
data2<-data1[,c('trust','geoID','sex','edu','work', 'age', 'reg', 'v8')]
#Add the additional country-level predictors 
data2<-merge(data2,allvars, by.x="geoID", by.y="geo")
data2$ort<-as.factor(recode(as.factor(data2$geoID), 'c("BG","RO","CY","GR")=1; else=0'))
#Order the data
data2<-data2[order(data2$geoID),]
#Make everything a factor
data2$geoID<-as.factor(data2$geoID)
data2$trust<-as.factor(data2$trust)
data2$sex<-as.factor(data2$sex)
data2$edu<-as.factor(data2$edu)
data2$work<-as.factor(data2$work)
data2$reg<-as.factor(data2$reg)
data2$age<-as.factor(data2$age)
data2$rel.camp<-as.factor(data2$rel.camp)
#scale the numerical variables
data2$gdp<-scale(data2$gdp)
data2$soc<-scale(data2$soc)
data2$gdp.gr<-scale(data2$gdp.gr)

# Part II. Get the real country means and describe variation ------------------
#d_real<-ddply(data2,"geoID",function(X) data.frame(m_real=mean(as.numeric(as.character(X$trust)))))
d_real<-ddply(data2,"geoID",function(X) data.frame(wm_real=weighted.mean(as.numeric(as.character(X$trust)), X$v8),
                                                   m_real=mean(as.numeric(as.character(X$trust)))))
d_real$n<-aggregate(data2$geoID, list(data2$geoID), FUN = length)$x
d_real$error<-sqrt((d_real$m_real-d_real$m_real^2)/d_real$n)
d_real$min<-d_real$m_real-1.96*d_real$error
d_real$max<-d_real$m_real+1.96*d_real$error
d_real$plusone<-d_real$m_real+1*d_real$error
d_real$minusone<-d_real$m_real-1*d_real$error

state.sd_real<-sd(d_real$m_real)
state.wsd_real<-sd(d_real$wm_real)

#state-level r-squared from the country-level model
states<-merge(d_real, allvars, by.x="geoID", by.y='geo')
states$reg<-recode(as.factor(states$geoID), 'c("AT","BE","DE","DK","GR","ES","FI","FR","IE","IT","NL","PT","SE","GB","LU")=0; else=1')
states$ort<-recode(as.factor(states$geoID), 'c("BG","RO","CY","GR")=1; else=0')
states$com<-recode(as.factor(states$geoID), 'c("AT","CY","MT", "BE","DE","DK","GR","ES","FI","FR","IE","IT","NL","PT","SE","GB","LU")=0; else=1')

summary(lm(m_real~rel.camp+ort, data=states)) #Find a good linear model
adj.r2<-summary(lm(m_real~rel.camp+reg, data=states))[9] #get the adj. r-squared from the model

#individual-level model only
summary(t1<-glm(trust~sex+edu+work+age, data=data2, family='binomial'))
mcfr2<-pR2(t1)[4] #McFadden's pseudo-R squared

#across vs. within variation
icc.function<-function(x,y){  #as in Buttice and Highton
  temp<-VarCorr(lmer(x~(1|y)))
  attributes(temp$y)$stddev^2/(attributes(temp)$sc^2+attributes(temp$y)$stddev^2)
}
icc<-icc.function(data2$trust,data2$geoID) #Note that a linear version of the model is used


# Part III. Getting the CENSUS data from EUROSTAT ------------------
#Get the relevant table from the 2001 census
cen<-getEurostatRCV(kod = "cens_01news")
#Filter for the relevant countries
cen.g<-subset(cen, geo=='AT '| geo=='BE '| geo=='BG '| geo=='CY '
                 | geo=='CZ '| geo=='DE '| geo=='DK '| geo=='EE '| geo=='EL '
                 | geo=='ES '| geo=='FI '| geo=='FR '| geo=='HU '
                 | geo=='IE '| geo=='IT '| geo=='LT '| geo=='LU '
                 | geo=='LV '| geo=='MT '| geo=='NL '| geo=='PL '| geo=='PT '
                 | geo=='RO '| geo=='SE '| geo=='SI '| geo=='SK '| geo=='UK ')
#Recode the countries to match
cen.g$geo<-recode(cen.g$geo, '"UK "="GB "; "EL "="GR "')
###Filter to get the totals out of the table
cen.gs<-subset(cen.g, sex!="T")
cen.gse<-subset(cen.gs, isced97=='ED0' | isced97=='ED1' | isced97=='ED2'| isced97=='ED3' 
                | isced97=='ED4' | isced97=='ED5_6' | isced97=='NED' | isced97=='UNK')
cen.gsew<-subset(cen.gse, wstatus=='EMP' | wstatus=='INAC' | wstatus=='UNE' 
                 | wstatus=='NAP' | wstatus=='UNK')
cen.gsewa<-subset(cen.gsew, age!="TOTAL")
p<-droplevels(cen.gsewa)
#Get a unique identifier
p$full<-paste(sep="_",p$wstatus,p$age,p$sex,p$geo)
#Combine ED0 and ED1
for (i in 1:nrow(p)){
  if (p$isced97[i]=='ED0' & is.na(p$value[i])==F) {
    for (j in 1:nrow(p)){
      if (p$full[i]==p$full[j] & p$isced97[j]=='ED1'){
        p$value[j]<-p$value[i]+p$value[j]
      } else next  } } else next }
#New unique identifier
p$full1<-paste(sep="_",p$isced97,p$age,p$sex,p$geo)
#Combine NAP and INAC
for (i in 1:nrow(p)){
  if (p$wstatus[i]=='NAP' & is.na(p$value[i])==F) {
    for (j in 1:nrow(p)){
      if (p$full1[i]==p$full1[j] & p$wstatus[j]=='INAC'){
        p$value[j]<-p$value[i]+p$value[j]
      } else next } } else next }
#Combine UNK and INAC
for (i in 1:nrow(p)){
  if (p$wstatus[i]=='UNK' & is.na(p$value[i])==F) {
    for (j in 1:nrow(p)){
      if (p$full1[i]==p$full1[j] & p$wstatus[j]=='INAC'){
        p$value[j]<-p$value[i]+p$value[j]
      } else next } }  else   next }
#Remove the no longer necessary lines
p<-subset(p,p$wstatus!="UNK")
p<-subset(p,p$wstatus!="NAP")
p<-subset(p,p$isced97!="ED0")
p<-droplevels(p)
#calculate the population share
p$share<-NA
u<-unique(p$geo)
for (i in u) {
  p.s<-subset(p,p$geo==i)
  c.sum<-sum(p.s$value, na.rm=T)
  for (j in 1:nrow(p)) {
  if (p$geo[j]==i)
  p$share[j]<-p$value[j]/c.sum
  else next  } }
#Check
for (i in u){p.s<-subset(p,p$geo==i)
  print(i)
  print(sum.na(p.s$share))}
#Get only the needed columns
popul<-p[,c(1,2,3,4,7,11)]
#Order
popul<-popul[order(popul$geo,popul$age,popul$sex,popul$isced97,popul$wstatus),]
#rename for ease
colnames(popul)<-c("work","edu","age","sex","geoID","share")
#strip the country strings
popul$geoID<-as.factor(strstrip(as.character(popul$geoID)))
#add the regional variable
popul$reg<-recode(popul$geoID, 'c("AT","BE","DE","DK","GR","ES","FI","FR","IE","IT","NL","PT","SE","GB","LU")=0; else=1')
popul$ort<-recode(as.factor(popul$geoID), 'c("BG","RO","CY","GR")=1; else=0')
#summary

summary(popul)
#save the data 
write.table(popul, 'E:/popul.txt')
#read the file
popul<-read.table('E:/popul.txt')
allvars$gdp<-scale(allvars$gdp)
popul<-merge(popul, allvars, by.x="geoID", by.y="geo") #add the country level predictors
popul$rel.camp<-recode(popul$rel.camp, '"cat"=0; else=1') #recode the categorical variable

# Part IV. Get a sample from the Eurobarometer data ------------------------------------------------
#Just in case
data2<-data2[order(data2$geoID),]
#Get the country weights
population <- getEurostatRCV(kod = "demo_gind")
population<-subset(population, indic_de=='AVG' & as.numeric(as.character(time))=='2001')
population<-subset(population, geo=='AT'| geo=='BG'| geo=='CY'
              | geo=='CZ'| geo=='DE_TOT'| geo=='DK'| geo=='EE'| geo=='EL'
              | geo=='ES'| geo=='FI'| geo=='FR'| geo=='HU'
              | geo=='IE'| geo=='IT'| geo=='LT'| geo=='LU'
              | geo=='LV'| geo=='NL'| geo=='PL'| geo=='PT'
              | geo=='RO'| geo=='SI'| geo=='SK'| geo=='UK')
#Recode the countries to match
population$geo<-recode(population$geo, '"UK"="GB"; "DE_TOT"="DE";"EL"="GR"')
#Calculate the proportion
population$prop<-population$value/sum(population$value)
#order the data
population<-population[order(population$geo),]
#override the proportion with an equal share
population$prop<-24/1500

#start the loop
#prepare the storage matrixes
#1. Naive disaggregation
naives<-as.data.frame(matrix(NA,24,101))
naives[,1]<-population$geo
#2. Random effects for geoID only
re0<-as.data.frame(matrix(NA,24,101))
re0[,1]<-population$geo
#3. Random effects for geoID and country-level predictors
mrp0<-as.data.frame(matrix(NA,24,101))
mrp0[,1]<-population$geo
#4. Ranomd effects for geoID, country-level predictors and individual-level random effects
mrp<-as.data.frame(matrix(NA,24,101))
mrp[,1]<-population$geo
#5. Collect model summaries
effects<-as.data.frame(matrix(NA,100, 2+5+1,))
colnames(effects)<-c("prot","reg",
                     "geoID","edu","age","work","sex",
                     "deviance")


for (i in 2:101){
s<-strata(data2, c('geoID'), method='srswor', size=round(population$prop*1500, digits=0))
sample.data<-getdata(data2,s)
#sample.data<-data2 #override subsampling for tests
#Sample means by country
d_naive<-ddply(sample.data,"geoID",function(X) data.frame(m_naive=mean(as.numeric(as.character(X$trust)))))
naives[,i]<-d_naive$m_naive

# Part V. Fit the multi-level regression models ------------------------------------------------
m<-glmer(trust~(1|geoID)+(1|age)+(1|sex)+(1|edu)+(1|work)+rel.camp+reg, family=binomial(link="logit"), data=sample.data)
#display(m)
m0<-glmer(trust~(1|geoID)+rel.camp+reg, family=binomial(link="logit"), data=sample.data)
mre0<-glmer(trust~(1|geoID), family=binomial(link="logit"), data=sample.data)

#record the full model estimates
effects[i-1,1]<-fixef(m)[2]
effects[i-1,2]<-fixef(m)[3]
effects[i-1,3]<-attributes(VarCorr(m)[[1]])$stddev
effects[i-1,4]<-attributes(VarCorr(m)[[2]])$stddev
effects[i-1,5]<-attributes(VarCorr(m)[[3]])$stddev
effects[i-1,6]<-attributes(VarCorr(m)[[4]])$stddev
effects[i-1,7]<-attributes(VarCorr(m)[[5]])$stddev
effects[i-1,8]<-summary(m)$dev[[1]][8]

#Obtain the predictions for the full model
popul$cellpred <- invlogit(fixef(m)["(Intercept)"]
                     +ranef(m)$geoID[as.character(popul$geoID),1]
                     +ranef(m)$age[as.character(popul$age),1]
                     +ranef(m)$sex[as.character(popul$sex),1]
                     +ranef(m)$edu[as.character(popul$edu),1]
                     +ranef(m)$work[as.character(popul$work),1]
                     + fixef(m)["rel.campprot"]*as.numeric(as.character(popul$rel.camp))
                     + fixef(m)["reg1"]*as.numeric(as.character(popul$reg))
                     )
#Calculate the weighted prediction          
popul$wpred<-popul$share*popul$cellpred
#Get the state predictions
statepred <- as.vector(tapply(popul$wpred,popul$geoID,sum.na))
mrp[,i]<-statepred

#Obtain the predictions for the model with country level predictors only
popul$cellpred0 <- invlogit(fixef(m0)["(Intercept)"]
                           +ranef(m0)$geoID[as.character(popul$geoID),1]
                            + fixef(m)["rel.campprot"]*as.numeric(as.character(popul$rel.camp))
                            + fixef(m)["reg1"]*as.numeric(as.character(popul$reg))
                                                        )
#Calculate the weighted prediction          
popul$wpred0<-popul$share*popul$cellpred0
#Get the state predictions
statepred0 <- as.vector(tapply(popul$wpred0,popul$geoID,sum.na))
mrp0[,i]<-statepred0

#Obtain the predictions for the model with country level random effects only
popul$cellpredre0 <- invlogit(fixef(mre0)["(Intercept)"]
                            +ranef(mre0)$geoID[as.character(popul$geoID),1]
                            )
#Calculate the weighted prediction          
popul$wpredre0<-popul$share*popul$cellpredre0
#Get the state predictions
statepredre0 <- as.vector(tapply(popul$wpredre0,popul$geoID,sum.na))
re0[,i]<-statepredre0
}

# Part VI. Evaluate the predictions ------------------------------------------------
results<-as.data.frame(matrix(NA,4*4,3))
colnames(results)<-c("mean","median","stdev")
rownames(results)<-c("rho.naive","rho.geo","rho.geoplus","rho.full",
                     "tau.naive","tau.geo","tau.geoplus","tau.full",
                     "mae.naive","mae.geo","mae.geoplus","mae.full",
                     "rmse.naive","rmse.geo","rmse.geoplus","rmse.full")

#correlations
results["rho.naive",]<-c(mean(array(cor(d_real$m_real,naives[,-1]))),
                        median(array(cor(d_real$m_real,naives[,-1]))),
                        sd(array(cor(d_real$m_real,naives[,-1]))))
results["rho.geo",]<-c(mean(array(cor(d_real$m_real,re0[,-1]))),
                         median(array(cor(d_real$m_real,re0[,-1]))),
                         sd(array(cor(d_real$m_real,re0[,-1]))))
results["rho.geoplus",]<-c(mean(array(cor(d_real$m_real,mrp0[,-1]))),
                         median(array(cor(d_real$m_real,mrp0[,-1]))),
                         sd(array(cor(d_real$m_real,mrp0[,-1]))))
results["rho.full",]<-c(mean(array(cor(d_real$m_real,mrp[,-1]))),
                         median(array(cor(d_real$m_real,mrp[,-1]))),
                         sd(array(cor(d_real$m_real,mrp[,-1]))))

#Kendal's tau
t1<-rep(NA,100)
t2<-rep(NA,100)
t3<-rep(NA,100)
t4<-rep(NA,100)
for (i in 2:101) (t1[i-1]<-Kendall (d_real$m_real,naives[,i])$tau[1]) #sl for different measure
for (i in 2:101) (t2[i-1]<-Kendall (d_real$m_real,re0[,i])$tau[1])
for (i in 2:101) (t3[i-1]<-Kendall (d_real$m_real,mrp0[,i])$tau[1])
for (i in 2:101) (t4[i-1]<-Kendall (d_real$m_real,mrp[,i])$tau[1])
results["tau.naive",]<-c(mean(t1),median(t1),sd(t1))
results["tau.geo",]<-c(mean(t2),median(t2),sd(t2))
results["tau.geoplus",]<-c(mean(t3),median(t3),sd(t3))
results["tau.full",]<-c(mean(t4),median(t4),sd(t4))

#RMSE
results["rmse.naive",]<-c(mean(array(apply(naives[,-1], 2 , "rmse" , d_real$m_real ))),
                          median(array(apply(naives[,-1], 2 , "rmse" , d_real$m_real ))),
                          sd(array(apply(naives[,-1], 2 , "rmse" , d_real$m_real ))))
results["rmse.geo",]<-c(mean(array(apply(re0[,-1], 2 , "rmse" , d_real$m_real ))),
                        median(array(apply(re0[,-1], 2 , "rmse" , d_real$m_real ))),
                        sd(array(apply(re0[,-1], 2 , "rmse" , d_real$m_real ))))
results["rmse.geoplus",]<-c(mean(array(apply(mrp0[,-1], 2 , "rmse" , d_real$m_real ))),
                            median(array(apply(mrp0[,-1], 2 , "rmse" , d_real$m_real ))),
                            sd(array(apply(mrp0[,-1], 2 , "rmse" , d_real$m_real ))))
results["rmse.full",]<-c(mean(array(apply(mrp[,-1], 2 , "rmse" , d_real$m_real ))),
                         median(array(apply(mrp[,-1], 2 , "rmse" , d_real$m_real ))),
                         sd(array(apply(mrp[,-1], 2 , "rmse" , d_real$m_real ))))
#MAE
results["mae.naive",]<-c(mean(array(apply(naives[,-1], 2 , "mae" , d_real$m_real ))),
                          median(array(apply(naives[,-1], 2 , "mae" , d_real$m_real ))),
                          sd(array(apply(naives[,-1], 2 , "mae" , d_real$m_real ))))
results["mae.geo",]<-c(mean(array(apply(re0[,-1], 2 , "mae" , d_real$m_real ))),
                        median(array(apply(re0[,-1], 2 , "mae" , d_real$m_real ))),
                        sd(array(apply(re0[,-1], 2 , "mae" , d_real$m_real ))))
results["mae.geoplus",]<-c(mean(array(apply(mrp0[,-1], 2 , "mae" , d_real$m_real ))),
                            median(array(apply(mrp0[,-1], 2 , "mae" , d_real$m_real ))),
                            sd(array(apply(mrp0[,-1], 2 , "mae" , d_real$m_real ))))
results["mae.full",]<-c(mean(array(apply(mrp[,-1], 2 , "mae" , d_real$m_real ))),
                         median(array(apply(mrp[,-1], 2 , "mae" , d_real$m_real ))),
                         sd(array(apply(mrp[,-1], 2 , "mae" , d_real$m_real ))))
results

##Check ow may of the MRP-estimates are within  the 50 and within the 95 confidence intervals
coverage<-as.data.frame(matrix(NA,100,3))
colnames(coverage)<-c("95CI","50CI", "cor")
cov.state<-as.data.frame(matrix(0,24,3))
colnames(cov.state)<-c("95CI","50CI", "geo")
cov.state$geo<-d_real$geo
for (i in 1:100){
  temp<-matrix(NA,24,2)
  for (j in 1:24){
  ifelse(mrp[j,i+1]>d_real$max[j] | mrp[j,i+1]<d_real$min[j], temp[j,1]<-0 ,temp[j,1]<-1)
  ifelse(mrp[j,i+1]>d_real$plusone[j] | mrp[j,i+1]<d_real$minusone[j], temp[j,2]<-0 ,temp[j,2]<-1)
  cov.state [j,1]<-cov.state [j,1]+temp[j,1]
  cov.state [j,2]<-cov.state [j,2]+temp[j,2]
  }
  coverage[i,1]<-sum(temp[,1])
  coverage[i,2]<-sum(temp[,2])
  coverage[i,3]<-round(cor(d_real$m_real,mrp[,i+1]), digits=2)
  
}
coverage
cov.state


#Write the tables of results
write.table(coverage, "E:/gender_coverage_equal_noweight.txt")
write.table(cov.state, "E:/gender_covered_states_equal_noweight.txt")
write.table(results, "E:/gender_results_equal_noweight.txt")
write.table(effects, "E:/gender_effects_equal_noweight.txt")


###The Figure
d_temp<-d_real
d_temp$m_real<-d_real$m_real+0.0001
d_real$index<-"A"
d_temp$index<-"B"
d_real$m<-d_real$m_real
d_temp$m<-rowMeans(mrp[-1])

for (i in 1:nrow(mrp)){
  d_temp$min[i]<-quantile(as.numeric(mrp[i,-1]), prob=0.05)
  d_temp$max[i]<-quantile(as.numeric(mrp[i,-1]), prob=0.95)
}

d_temp$geoID<-paste(sep=".", as.character(d_temp$geoID), "1")

d_empty<-d_temp
d_empty$geoID<-paste(sep=".", as.character(d_real$geoID), "2")
d_empty$m_real<-d_real$m_real-0.00001
d_empty$index<-"C"

dd<-rbind(d_real,d_temp, d_empty)
dd<-dd[order(-dd$m_real),]
dd$geoID <- factor(dd$geoID, as.character(dd$geoID))

plot1 <- ggplot(dd, aes(m, geoID, group=geoID, colour=index)) + geom_point(size = 3.5) + ylab("")+
  xlab('Gender equality') +
  geom_segment(aes(x=min, xend=max, yend=geoID))+theme_bw()
plot2<-plot1+scale_y_discrete(labels=dd$geoID[c(seq(2,72, by=3))], breaks=dd$geoID[c(seq(2,72, by=3))])+
  theme(legend.position=c(.8, 0.9)) +
  scale_x_continuous(labels=percent)+
  scale_colour_manual(name="State means", values = c("black","red", "white"), labels=c("  Real (full sample)", "  MRP-based", ""))

pdf(file='E:/Q8_equal_noweight.pdf', width=11,height=9)
plot2
dev.off()
